In [3]:
from matplotlib import pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
In [ ]:
In [2]:
# Read the raw interval export. header=None makes pandas treat the first row
# as data and label the columns 0, 1, 2, ...
kwhs = pd.read_csv(
    "../../prop39schools/sample_data/15634040000000_2012-2013_SCE_ELECTRIC_20150722.xml_INTERVAL.csv",
    header=None,
)
In [3]:
# Give the first two positional columns meaningful names.
kwhs = kwhs.rename(columns={0: "id", 1: "date"})
# Remember the id found on the first row; the column itself is dropped below.
agreement_id = kwhs["id"].iloc[0]
# The date column holds Unix timestamps; reduce each one to an ISO date string.
# NOTE(review): datetime.fromtimestamp() converts using the machine's local
# timezone, so the resulting dates depend on where this runs — confirm intended.
kwhs["date"] = kwhs["date"].apply(
    lambda stamp: datetime.fromtimestamp(int(stamp)).strftime("%Y-%m-%d")
)
# Drop the id column and index the remaining columns by date string.
kwhs = kwhs.drop("id", axis=1).set_index(["date"])
In [4]:
# Preview the first five rows (head's default) of the date-indexed table.
kwhs.head()
Out[4]:
In [5]:
# (number of rows, number of columns) of the interval table.
kwhs.shape
Out[5]:
In [6]:
# Number of columns whose column-wise total is strictly positive.
kwhs.sum(axis=0).gt(0).sum()
Out[6]:
In [7]:
# Number of columns holding at least one non-null value, less 2.
# NOTE(review): the reason for subtracting 2 is not evident from this
# notebook — confirm it is still wanted.
kwhs.count().gt(0).sum() - 2
Out[7]:
In [8]:
# Show the data stored under the date label '2013-06-03'.
# .loc replaces .ix, which was deprecated in pandas 0.20 and removed in 1.0.
# The index holds date strings, so this lookup was already label-based and
# .loc returns the same result.
kwhs.loc['2013-06-03']
Out[8]:
In [9]:
# It looks like this is 15-minute interval data?
# Overlay per-column summary statistics with the profiles of a few random dates.
plt.figure(figsize=(8,4))
plt.plot(kwhs.max().values, label="max", lw=2)
plt.plot(kwhs.std().values, label="std", lw=2)
plt.plot(kwhs.mean().values, label="avg", lw=2)
# Draw five index labels (np.random.choice samples with replacement by default).
# NOTE(review): no seed is set, so a different sample is plotted on every run.
idx = np.random.choice(kwhs.index.values, 5)
for i in idx:
    # .loc replaces .ix (deprecated in pandas 0.20, removed in 1.0); the index
    # is made of strings, so the lookup is label-based either way.
    # Presumably several rows share a date label and values[0] picks the first
    # of them — TODO confirm.
    plt.plot(kwhs.loc[i].values[0], label=i)
plt.ylabel("kWh")
plt.legend()
Out[9]:
In [10]:
# Display the index labels that were randomly sampled for the plot.
idx
Out[10]:
In [11]:
# Render the first 96 columns as an image, one image row per table row.
# NOTE(review): 96 presumably corresponds to 24 hours x 4 quarter-hour
# readings — confirm.
plt.figure(figsize=(8, 6))
plt.imshow(
    kwhs.iloc[:, :96].values,
    interpolation='nearest',
    aspect='auto',
)
Out[11]:
In [12]:
# Load the billing export; with read_csv's default header handling the first
# row supplies the column names.
bill = pd.read_csv(
    "../../prop39schools/sample_data/15634040000000_2012-2013_SCE_ELECTRIC_20150722.xml_BILL.csv"
)
# Show the second record (position 1) as a Series.
bill.iloc[1]
Out[12]:
In [13]:
# (number of rows, number of columns) of the billing table.
bill.shape
Out[13]:
In [14]:
# Keep the zip code found on the first billing record.
zipcode = bill['school_site_zip'].iloc[0]
# Replace the Unix-timestamp start times with ISO date strings.
# NOTE(review): datetime.fromtimestamp() converts using the machine's local
# timezone — confirm intended. The column is overwritten, so running this
# cell a second time fails on int() of an already-formatted date string.
bill['start_time'] = bill['start_time'].apply(
    lambda stamp: datetime.fromtimestamp(int(stamp)).strftime('%Y-%m-%d')
)
In [15]:
# Histogram of the off_peak_demand column using 20 bins.
bill['off_peak_demand'].hist(bins=20)
Out[15]:
In [16]:
# Histogram of the on_peak_demand column using 20 bins.
bill['on_peak_demand'].hist(bins=20)
Out[16]:
In [17]:
# Preview the first five billing rows (head's default).
bill.head()
Out[17]:
In [18]:
# Count how many billing rows carry each distinct school_site_name value.
bill['school_site_name'].value_counts()
Out[18]:
In [ ]:
In [ ]: